In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa as librosa
import librosa.display
import os
In [2]:
df=pd.read_csv('note_info.csv')
In [3]:
df.head()
Out[3]:
Unnamed: 0 note_str sample_rate qualities_str instrument_source instrument_family_str instrument_family note instrument_source_str qualities pitch instrument_str instrument velocity
0 keyboard_acoustic_004-060-025 keyboard_acoustic_004-060-025 16000 ['dark', 'reverb'] 0 keyboard 4 278915 acoustic [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] 60 keyboard_acoustic_004 327 25
1 bass_synthetic_033-050-100 bass_synthetic_033-050-100 16000 ['dark'] 2 bass 0 270361 synthetic [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] 50 bass_synthetic_033 417 100
2 bass_synthetic_009-052-050 bass_synthetic_009-052-050 16000 ['bright', 'distortion', 'long_release'] 2 bass 0 270001 synthetic [1, 0, 1, 0, 1, 0, 0, 0, 0, 0] 52 bass_synthetic_009 150 50
3 keyboard_electronic_003-064-127 keyboard_electronic_003-064-127 16000 [] 1 keyboard 4 50978 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 64 keyboard_electronic_003 65 127
4 bass_synthetic_034-030-050 bass_synthetic_034-030-050 16000 ['distortion', 'tempo-synced'] 2 bass 0 265159 synthetic [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] 30 bass_synthetic_034 420 50
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12678 entries, 0 to 12677
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Unnamed: 0             12678 non-null  object
 1   note_str               12678 non-null  object
 2   sample_rate            12678 non-null  int64 
 3   qualities_str          12678 non-null  object
 4   instrument_source      12678 non-null  int64 
 5   instrument_family_str  12678 non-null  object
 6   instrument_family      12678 non-null  int64 
 7   note                   12678 non-null  int64 
 8   instrument_source_str  12678 non-null  object
 9   qualities              12678 non-null  object
 10  pitch                  12678 non-null  int64 
 11  instrument_str         12678 non-null  object
 12  instrument             12678 non-null  int64 
 13  velocity               12678 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 1.4+ MB
In [5]:
len(df['pitch'].unique())
Out[5]:
112
In [6]:
def extract_feature_means(audio_file_path: str) -> pd.DataFrame:
    number_of_mfcc = 20
    y, sr = librosa.load(audio_file_path)
    signal, _ = librosa.effects.trim(y)  
    n_fft = 512  
    hop_length = 256  

    d_audio = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))

    db_audio = librosa.amplitude_to_db(d_audio, ref=np.max)

    
    s_audio = librosa.feature.melspectrogram(signal, sr=sr)
    s_db_audio = librosa.amplitude_to_db(s_audio, ref=np.max)

    y_harm, y_perc = librosa.effects.hpss(signal)

    spectral_centroids = librosa.feature.spectral_centroid(signal, sr=sr)[0]
    spectral_centroids_delta = librosa.feature.delta(spectral_centroids, mode = 'nearest')
    spectral_centroids_accelerate = librosa.feature.delta(spectral_centroids, order=2, mode = 'nearest')

    hop_length = 256

    chromagram = librosa.feature.chroma_stft(signal, sr=sr, hop_length=hop_length)

    tempo_y, _ = librosa.beat.beat_track(signal, sr=sr)

    spectral_rolloff = librosa.feature.spectral_rolloff(signal, sr=sr)[0]

    onset_env = librosa.onset.onset_strength(y=signal, sr=sr)

    spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(signal, sr=sr)[0]
    spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(signal, sr=sr, p=3)[0]
    spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(signal, sr=sr, p=4)[0]

    audio_features = {
        "file_name": audio_file_path,
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(signal)[0]),
        "zero_crossings": np.sum(librosa.zero_crossings(signal, pad=False)),
        "spectrogram": np.mean(db_audio[0]),
        "mel_spectrogram": np.mean(s_db_audio[0]),
        "harmonics": np.mean(y_harm),
        "perceptual_shock_wave": np.mean(y_perc),
        "spectral_centroids": np.mean(spectral_centroids),
        "spectral_centroids_delta": np.mean(spectral_centroids_delta),
        "spectral_centroids_accelerate": np.mean(spectral_centroids_accelerate),
        "chroma1": np.mean(chromagram[0]),
        "chroma2": np.mean(chromagram[1]),
        "chroma3": np.mean(chromagram[2]),
        "chroma4": np.mean(chromagram[3]),
        "chroma5": np.mean(chromagram[4]),
        "chroma6": np.mean(chromagram[5]),
        "chroma7": np.mean(chromagram[6]),
        "chroma8": np.mean(chromagram[7]),
        "chroma9": np.mean(chromagram[8]),
        "chroma10": np.mean(chromagram[9]),
        "chroma11": np.mean(chromagram[10]),
        "chroma12": np.mean(chromagram[11]),
        "tempo_bpm": tempo_y,
        "spectral_rolloff": np.mean(spectral_rolloff),
        "spectral_flux": np.mean(onset_env),
        "spectral_bandwidth_2": np.mean(spectral_bandwidth_2),
        "spectral_bandwidth_3": np.mean(spectral_bandwidth_3),
        "spectral_bandwidth_4": np.mean(spectral_bandwidth_4),
    }

    mfcc_df = extract_mfcc_feature_means(audio_file_path,
                                    signal,
                                    sample_rate=sr,
                                    number_of_mfcc=number_of_mfcc)

    df = pd.DataFrame.from_records(data=[audio_features])

    df = pd.merge(df, mfcc_df, on='file_name')

    return df


def extract_mfcc_feature_means(audio_file_name: str,
                          signal: np.ndarray,
                          sample_rate: int,
                          number_of_mfcc: int) -> pd.DataFrame:

    mfcc_alt = librosa.feature.mfcc(y=signal, sr=sample_rate,
                                    n_mfcc=number_of_mfcc)
    delta = librosa.feature.delta(mfcc_alt)
    accelerate = librosa.feature.delta(mfcc_alt, order=2)

    mfcc_features = {
        "file_name": audio_file_name,
    }

    for i in range(0, number_of_mfcc):
        key_name = "".join(['mfcc', str(i)])
        mfcc_value = np.mean(mfcc_alt[i])
        mfcc_features.update({key_name: mfcc_value})

        key_name = "".join(['mfcc_delta_', str(i)])
        mfcc_value = np.mean(delta[i])
        mfcc_features.update({key_name: mfcc_value})

        key_name = "".join(['mfcc_accelerate_', str(i)])
        mfcc_value = np.mean(accelerate[i])
        mfcc_features.update({key_name: mfcc_value})

    df = pd.DataFrame.from_records(data=[mfcc_features])
    return df
In [7]:
path = r"C:\Users\ksivi\Desktop\New folder\nsynth-valid\audio"
dir_list = os.listdir(path)
In [8]:
len(dir_list)
Out[8]:
12676
In [ ]:
info = []

for i in dir_list:
    try:
        data = extract_feature_means("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+i)
    except:
        continue
    values = data.values
    info.append(values[0])
In [11]:
df_2 = extract_feature_means('bass_electronic_018-022-025.wav')
In [12]:
df_2.head()
Out[12]:
file_name zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate ... mfcc_accelerate_16 mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19
0 bass_electronic_018-022-025.wav 0.210125 15742 -66.413757 -77.642357 0.000006 -0.000673 2671.302747 15.874826 -0.008433 ... 0.046337 -0.739872 -0.046959 0.051725 -0.290447 -0.08219 0.018868 2.694299 -0.056343 -0.000569

1 rows × 88 columns

In [13]:
columns = df_2.columns
In [ ]:
va = pd.DataFrame(data=info, columns = columns)
In [ ]:
va['name'] = va['name'].str[:-4]
In [ ]:
df['note_str'] = df['note_str'].astype(str)
In [ ]:
samples = df.merge(va,left_on='note_str', right_on='name',how='right')
In [15]:
samples['instrument_family_str'].value_counts()
Out[15]:
bass        2634
keyboard    2403
guitar      2070
organ       1598
brass        886
string       814
reed         720
mallet       663
flute        470
vocal        404
Name: instrument_family_str, dtype: int64
In [16]:
len(samples)
Out[16]:
12662
In [17]:
plt.figure(figsize=(20,4))
x, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
librosa.display.waveshow(y=x,sr=sr)
plt.title("Mallet Audio Wave")
plt.xlim(-0.1,2);
plt.savefig('mallet_wave.png')
In [18]:
y, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True)
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
img = librosa.display.specshow(D, y_axis='linear', x_axis='time',sr=sr, ax=ax[0])
ax[0].set(title=dir_list[455])
ax[0].label_outer()
hop_length = 1024
D = librosa.amplitude_to_db(np.abs(librosa.stft(y, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D, y_axis='log', sr=sr, hop_length=hop_length,x_axis='time', ax=ax[1])
ax[1].set(title='Log-frequency power spectrogram')
ax[1].label_outer()
fig.colorbar(img, ax=ax, format="%+2.f dB")
ax[0].set_xlim(0,2)
ax[1].set_xlim(0,2);
plt.savefig('bass_spec.png')
In [19]:
y1, sr1 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[4002])
y2, sr2 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20,8))
fig.tight_layout(pad=5)
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time',sr=sr1, ax=axs[0,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time',sr=sr2, ax=axs[0,1])

axs[0,0].set_title(dir_list[4002], fontsize=20)
axs[0,1].set_title(dir_list[455], fontsize=20)
hop_length = 1024
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D1, y_axis='log', sr=sr1, hop_length=hop_length,x_axis='time', ax=axs[1,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D2, y_axis='log', sr=sr2, hop_length=hop_length,x_axis='time', ax=axs[1,1])

axs[1,0].set(title='Log-frequency power spectrogram')
axs[1,1].set(title='Log-frequency power spectrogram')
fig.colorbar(img, ax=axs, format="%+2.f dB")
plt.setp(axs,xlim=(0,2));
plt.savefig('guitar_bass_spec.png')
In [20]:
y3, sr3 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
y4, sr4 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[7481])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20,8))
fig.tight_layout(pad=5)
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y3)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time',sr=sr3, ax=axs[0,0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y4)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time',sr=sr4, ax=axs[0,1])

axs[0,0].set_title(dir_list[8512], fontsize=20)
axs[0,1].set_title(dir_list[7481],fontsize=20)
hop_length = 1024

D3 = librosa.amplitude_to_db(np.abs(librosa.stft(y3, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D3, y_axis='log', sr=sr3, hop_length=hop_length,x_axis='time', ax=axs[1,0])
D4 = librosa.amplitude_to_db(np.abs(librosa.stft(y4, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D4, y_axis='log', sr=sr4, hop_length=hop_length,x_axis='time', ax=axs[1,1])
axs[1,0].set(title='Log-frequency power spectrogram')
axs[1,1].set(title='Log-frequency power spectrogram')

fig.colorbar(img, ax=axs, format="%+2.f dB")
plt.setp(axs,xlim=(0,2));
plt.savefig('mallet_keyboard_spec.png')
In [21]:
plt.figure(figsize=(16,16), dpi=200)
sns.scatterplot(data=samples,x='spectral_centroids', y='zero_crossing_rate', hue = 'pitch',palette ='viridis')
plt.title('Zero Crossing Rate vs Spetral Centroids')
plt.xlabel('Spectral Centroids')
plt.ylabel('Zero Crossing Rate')
plt.savefig('cent_zero_cross_scatter.png')
In [22]:
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc3')
plt.xlabel('Instrument')
plt.ylabel('MFCC 3');
plt.title('MFCC 3 by Instrument')
plt.savefig('mfcc3_inst.png')
In [23]:
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc8')
plt.xlabel('Instrument')
plt.title('MFCC 8 by Instrument')
plt.ylabel('MFCC 8');
plt.savefig('mfcc8_inst.png')
In [24]:
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'spectral_bandwidth_2')
plt.xlabel('Instrument')
plt.ylabel('spectral_bandwidth')
plt.title('Spectral Bandwidth by Instrument');
plt.savefig('spec_band.png')
In [25]:
sns.pairplot(data=samples, vars = ['mfcc2','mfcc3','mfcc4','mfcc5','mfcc6','mfcc7','spectral_bandwidth_3'],hue='pitch', palette='viridis')
plt.title('Pair Plot of Various MFCCs by Pitch')
plt.savefig('pairplot.png')
In [26]:
samples.head()
Out[26]:
Unnamed: 0 note_str sample_rate qualities_str instrument_source instrument_family_str instrument_family note instrument_source_str qualities ... mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19 name
0 0 bass_electronic_018-022-050 16000 ['percussive'] 1 bass 0 277009 electronic [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] ... -0.739872 -0.046959 0.051725 -0.290447 -0.082190 0.018868 2.694299 -0.056343 -0.000569 bass_electronic_018-022-050
1 1 bass_electronic_018-022-127 16000 ['fast_decay', 'percussive'] 1 bass 0 223304 electronic [0, 0, 0, 1, 0, 0, 0, 1, 0, 0] ... -0.969568 0.028864 -0.215973 -2.457192 -0.140675 0.338459 -0.499463 -0.884398 0.581262 bass_electronic_018-022-127
2 2 bass_electronic_018-023-050 16000 [] 1 bass 0 222626 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 1.079908 -0.011627 0.000507 0.035780 0.013179 0.005179 -0.784336 0.031024 -0.011306 bass_electronic_018-023-050
3 3 bass_electronic_018-023-100 16000 [] 1 bass 0 230338 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 1.711984 0.009815 0.006658 0.741615 -0.012899 0.003196 -0.122626 0.010039 -0.027255 bass_electronic_018-023-100
4 4 bass_electronic_018-024-050 16000 [] 1 bass 0 284868 electronic [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ... 1.546056 0.016031 -0.002961 0.683596 -0.000470 0.006038 -0.126844 -0.017591 -0.018448 bass_electronic_018-024-050

5 rows × 103 columns

In [27]:
samples = samples.drop(['note_str', 'instrument_source', 'instrument_family','sample_rate', 'qualities_str', 'name','instrument_source_str', 'qualities', 'Unnamed: 0','instrument_str', 'instrument','velocity','file_name','note'], axis = 1)
In [134]:
note=samples[samples['pitch'] > 21]
In [135]:
note=note[note['pitch']<108]
In [136]:
inst = note.drop('pitch', axis=1)
In [137]:
note=note.drop('instrument_family_str',axis=1)
In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
In [132]:
samples['instrument_family_str'].unique()
Out[132]:
array(['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ',
       'reed', 'string', 'vocal'], dtype=object)

Instrument Identification

In [138]:
X=inst.drop('instrument_family_str', axis=1)
y=inst['instrument_family_str']
In [139]:
scaler = StandardScaler()
In [140]:
X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(X, y, test_size=0.15, random_state=101)
In [141]:
scaled_X_train_inst = scaler.fit_transform(X_train_inst)
scaled_X_test_inst = scaler.transform(X_test_inst)
In [42]:
log_model=LogisticRegression(solver='saga', multi_class='ovr', max_iter=10000)
In [45]:
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
In [46]:
grid_model = GridSearchCV(log_model,param_grid={'C':C,'penalty':penalty})
In [49]:
grid_model.fit(scaled_X_train_inst,y_train_inst)
Out[49]:
GridSearchCV(estimator=LogisticRegression(max_iter=10000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']})
In [50]:
y_preds = grid_model.predict(scaled_X_test_inst)
In [51]:
from sklearn.metrics import classification_report, plot_confusion_matrix
In [53]:
print(classification_report(y_preds,y_test_inst))
              precision    recall  f1-score   support

        bass       0.83      0.77      0.80       422
       brass       0.93      0.84      0.88       140
       flute       0.72      0.72      0.72        80
      guitar       0.71      0.72      0.71       315
    keyboard       0.74      0.78      0.76       334
      mallet       0.63      0.83      0.72        81
       organ       0.95      0.91      0.93       274
        reed       0.94      1.00      0.97        94
      string       0.86      0.85      0.86       113
       vocal       0.94      0.96      0.95        47

    accuracy                           0.81      1900
   macro avg       0.83      0.84      0.83      1900
weighted avg       0.82      0.81      0.81      1900

In [54]:
plt.figure(figsize=(14,14))
plot_confusion_matrix(grid_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation=90);
<Figure size 1008x1008 with 0 Axes>
In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [142]:
test_error_rates = []


for k in range(1,100):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(scaled_X_train_inst,y_train_inst) 
   
    y_pred_test = knn_model.predict(scaled_X_test_inst)
    
    test_error = 1 - accuracy_score(y_test_inst,y_pred_test)
    test_error_rates.append(test_error)
In [143]:
plt.figure(figsize=(10,6),dpi=200)
plt.plot(range(1,100),test_error_rates,label='Test Error')
plt.legend()
plt.ylabel('Error Rate')
plt.xlabel("K Value")
Out[143]:
Text(0.5, 0, 'K Value')
In [144]:
KNN_model = KNeighborsClassifier(n_neighbors=1)
In [145]:
KNN_model.fit(scaled_X_train_inst,y_train_inst) 
   
y_pred_test = KNN_model.predict(scaled_X_test_inst)
In [146]:
print(classification_report(y_pred_test,y_test_inst))
              precision    recall  f1-score   support

        bass       0.99      0.96      0.98       390
       brass       0.98      0.97      0.97       129
       flute       1.00      0.98      0.99        89
      guitar       0.97      0.96      0.96       286
    keyboard       0.97      0.99      0.98       355
      mallet       0.97      0.99      0.98        98
       organ       0.98      1.00      0.99       226
        reed       0.98      1.00      0.99        96
      string       0.99      0.99      0.99       137
       vocal       1.00      1.00      1.00        50

    accuracy                           0.98      1856
   macro avg       0.98      0.98      0.98      1856
weighted avg       0.98      0.98      0.98      1856

In [147]:
from sklearn.ensemble import RandomForestClassifier
In [148]:
rand_model = RandomForestClassifier()
In [149]:
rand_model.fit(scaled_X_train_inst,y_train_inst)
Out[149]:
RandomForestClassifier()
In [150]:
preds_inst = rand_model.predict(scaled_X_test_inst)
In [151]:
print(classification_report(preds_inst,y_test_inst))
              precision    recall  f1-score   support

        bass       1.00      1.00      1.00       382
       brass       1.00      1.00      1.00       128
       flute       1.00      1.00      1.00        87
      guitar       0.99      1.00      0.99       281
    keyboard       1.00      0.99      1.00       363
      mallet       1.00      1.00      1.00       100
       organ       1.00      1.00      1.00       231
        reed       1.00      1.00      1.00        98
      string       1.00      1.00      1.00       136
       vocal       1.00      1.00      1.00        50

    accuracy                           1.00      1856
   macro avg       1.00      1.00      1.00      1856
weighted avg       1.00      1.00      1.00      1856

In [152]:
plot_confusion_matrix(rand_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation = 90);

Note Identification

In [73]:
note.head()
Out[73]:
pitch zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate ... mfcc_accelerate_16 mfcc17 mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19
0 22 0.210125 15742 -66.413757 -77.642357 0.000006 -0.000673 2671.302747 15.874826 -0.008433 ... 0.046337 -0.739872 -0.046959 0.051725 -0.290447 -0.082190 0.018868 2.694299 -0.056343 -0.000569
1 22 0.062500 459 -35.409962 -52.084045 0.001413 -0.001678 667.165083 136.698995 35.325723 ... -0.275118 -0.969568 0.028864 -0.215973 -2.457192 -0.140675 0.338459 -0.499463 -0.884398 0.581262
2 23 0.008180 589 -25.864546 -30.828018 0.000110 0.000114 129.129320 6.599145 0.871162 ... 0.013104 1.079908 -0.011627 0.000507 0.035780 0.013179 0.005179 -0.784336 0.031024 -0.011306
3 23 0.007953 562 -27.519205 -33.968998 0.000027 0.000153 120.454945 5.414388 1.140152 ... 0.006131 1.711984 0.009815 0.006658 0.741615 -0.012899 0.003196 -0.122626 0.010039 -0.027255
4 24 0.009085 646 -26.927103 -31.652479 0.000025 0.000088 127.136842 5.570173 0.700538 ... 0.010970 1.546056 0.016031 -0.002961 0.683596 -0.000470 0.006038 -0.126844 -0.017591 -0.018448

5 rows × 88 columns

In [74]:
X = note.drop('pitch', axis=1)
y=note['pitch']
In [75]:
len(y)
Out[75]:
12373
In [95]:
X_train_note, X_test_note, y_train_note, y_test_note = train_test_split(X, y, test_size=0.15, random_state=101)
In [96]:
rand_model2 = RandomForestClassifier()
In [97]:
rand_model2.fit(X_train_note,y_train_note)
Out[97]:
RandomForestClassifier()
In [98]:
preds = rand_model2.predict(X_test_note)
In [99]:
print(classification_report(preds,y_test_note))
              precision    recall  f1-score   support

          22       0.80      0.67      0.73        12
          23       0.71      0.75      0.73        16
          24       0.63      0.85      0.72        20
          25       0.70      0.88      0.78        16
          26       1.00      0.71      0.83        14
          27       0.87      1.00      0.93        20
          28       0.83      0.75      0.79        20
          29       0.86      0.83      0.85        30
          30       0.87      0.87      0.87        15
          31       0.89      0.85      0.87        20
          32       0.87      0.96      0.92        28
          33       0.96      0.86      0.91        28
          34       0.96      0.92      0.94        26
          35       0.76      1.00      0.86        16
          36       0.88      0.79      0.83        19
          37       0.85      0.79      0.82        29
          38       0.88      0.85      0.86        33
          39       0.90      0.84      0.87        32
          40       0.93      0.87      0.90        30
          41       0.90      0.93      0.92        30
          42       0.81      0.92      0.86        24
          43       1.00      0.87      0.93        30
          44       0.88      0.81      0.85        27
          45       0.87      0.95      0.91        21
          46       0.93      0.90      0.91        29
          47       0.94      0.83      0.88        18
          48       0.95      1.00      0.97        18
          49       0.89      0.92      0.91        26
          50       0.91      0.94      0.92        31
          51       0.89      0.97      0.93        32
          52       0.94      0.97      0.95        32
          53       0.93      0.93      0.93        27
          54       1.00      0.97      0.98        29
          55       0.97      0.97      0.97        33
          56       0.97      0.95      0.96        38
          57       0.97      0.94      0.95        31
          58       0.88      0.96      0.92        23
          59       0.96      1.00      0.98        26
          60       1.00      0.93      0.97        30
          61       0.96      0.93      0.94        27
          62       1.00      0.95      0.98        22
          63       1.00      1.00      1.00        28
          64       1.00      0.96      0.98        27
          65       1.00      1.00      1.00        18
          66       1.00      1.00      1.00        25
          67       1.00      1.00      1.00        22
          68       1.00      1.00      1.00        21
          69       0.92      1.00      0.96        22
          70       1.00      0.96      0.98        23
          71       1.00      0.80      0.89        20
          72       1.00      1.00      1.00        20
          73       1.00      0.93      0.97        15
          74       0.95      1.00      0.97        18
          75       0.96      0.96      0.96        24
          76       0.92      1.00      0.96        23
          77       1.00      1.00      1.00        18
          78       0.94      0.94      0.94        17
          79       1.00      1.00      1.00        30
          80       1.00      1.00      1.00        15
          81       0.96      0.92      0.94        26
          82       0.93      0.93      0.93        15
          83       0.88      1.00      0.93        21
          84       1.00      0.88      0.93        24
          85       0.82      1.00      0.90        14
          86       0.94      0.94      0.94        18
          87       0.95      0.91      0.93        22
          88       0.96      0.96      0.96        24
          89       0.95      1.00      0.98        20
          90       0.82      0.90      0.86        10
          91       0.87      0.87      0.87        15
          92       0.93      1.00      0.97        14
          93       0.88      0.93      0.90        15
          94       0.83      1.00      0.91        15
          95       0.92      1.00      0.96        11
          96       1.00      0.93      0.97        15
          97       0.89      0.94      0.91        17
          98       1.00      1.00      1.00        15
          99       1.00      1.00      1.00        13
         100       0.82      0.90      0.86        10
         101       0.91      0.77      0.83        13
         102       0.94      0.58      0.71        26
         103       1.00      1.00      1.00        11
         104       1.00      1.00      1.00        14
         105       0.86      1.00      0.92        12
         106       0.82      0.93      0.87        15
         107       0.88      0.88      0.88        17

    accuracy                           0.92      1856
   macro avg       0.92      0.92      0.92      1856
weighted avg       0.93      0.92      0.92      1856

In [100]:
from sklearn.model_selection import GridSearchCV
In [108]:
param_grid = {"n_estimators":[100,150,200,250],'max_depth':[6,10,14,20,25]}
In [109]:
rand_model3 = RandomForestClassifier()
In [110]:
grid = GridSearchCV(rand_model3,param_grid)
In [111]:
grid.fit(X_train_note,y_train_note)
Out[111]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [6, 10, 14, 20, 25],
                         'n_estimators': [100, 150, 200, 250]})
In [112]:
grid.best_params_
Out[112]:
{'max_depth': 25, 'n_estimators': 150}
In [113]:
preds = grid.predict(X_test_note)
In [114]:
print(classification_report(preds,y_test_note))
              precision    recall  f1-score   support

          22       0.90      0.75      0.82        12
          23       0.71      0.86      0.77        14
          24       0.59      0.84      0.70        19
          25       0.80      0.84      0.82        19
          26       1.00      0.77      0.87        13
          27       0.91      0.95      0.93        22
          28       0.89      0.80      0.84        20
          29       0.79      0.88      0.84        26
          30       0.87      0.76      0.81        17
          31       0.95      0.90      0.92        20
          32       0.94      0.97      0.95        30
          33       1.00      0.89      0.94        28
          34       0.88      0.92      0.90        24
          35       0.76      0.89      0.82        18
          36       0.94      0.80      0.86        20
          37       0.96      0.87      0.91        30
          38       0.88      0.90      0.89        31
          39       0.87      0.87      0.87        30
          40       0.93      0.96      0.95        27
          41       0.94      0.91      0.92        32
          42       0.81      1.00      0.90        22
          43       1.00      0.90      0.95        29
          44       0.88      0.88      0.88        25
          45       0.87      0.91      0.89        22
          46       0.89      0.86      0.88        29
          47       0.94      0.79      0.86        19
          48       0.95      1.00      0.97        18
          49       0.89      0.92      0.91        26
          50       0.91      0.97      0.94        30
          51       0.94      0.94      0.94        35
          52       0.97      0.97      0.97        33
          53       0.89      0.89      0.89        27
          54       1.00      0.97      0.98        29
          55       0.97      0.97      0.97        33
          56       0.95      0.95      0.95        37
          57       1.00      0.94      0.97        32
          58       0.88      0.96      0.92        23
          59       1.00      0.96      0.98        28
          60       1.00      0.90      0.95        31
          61       0.96      0.96      0.96        26
          62       1.00      0.91      0.95        23
          63       1.00      1.00      1.00        28
          64       1.00      0.96      0.98        27
          65       1.00      1.00      1.00        18
          66       1.00      1.00      1.00        25
          67       1.00      1.00      1.00        22
          68       0.95      1.00      0.98        20
          69       0.92      1.00      0.96        22
          70       1.00      0.92      0.96        24
          71       1.00      0.84      0.91        19
          72       1.00      1.00      1.00        20
          73       1.00      1.00      1.00        14
          74       1.00      1.00      1.00        19
          75       0.96      0.96      0.96        24
          76       0.92      1.00      0.96        23
          77       1.00      1.00      1.00        18
          78       0.94      1.00      0.97        16
          79       1.00      1.00      1.00        30
          80       1.00      1.00      1.00        15
          81       0.96      0.96      0.96        25
          82       1.00      1.00      1.00        15
          83       0.88      1.00      0.93        21
          84       1.00      0.91      0.95        23
          85       0.88      1.00      0.94        15
          86       0.94      0.89      0.92        19
          87       0.95      0.95      0.95        21
          88       0.96      1.00      0.98        23
          89       0.95      1.00      0.98        20
          90       0.82      0.90      0.86        10
          91       0.87      0.93      0.90        14
          92       1.00      1.00      1.00        15
          93       0.88      0.88      0.88        16
          94       0.78      1.00      0.88        14
          95       0.92      1.00      0.96        11
          96       1.00      0.93      0.97        15
          97       0.89      1.00      0.94        16
          98       1.00      1.00      1.00        15
          99       1.00      1.00      1.00        13
         100       0.82      0.82      0.82        11
         101       0.91      0.77      0.83        13
         102       0.94      0.60      0.73        25
         103       1.00      0.73      0.85        15
         104       1.00      1.00      1.00        14
         105       0.86      0.92      0.89        13
         106       0.76      0.93      0.84        14
         107       0.88      0.88      0.88        17

    accuracy                           0.93      1856
   macro avg       0.93      0.93      0.92      1856
weighted avg       0.93      0.93      0.93      1856

In [119]:
X_test_note['Note'] = preds
In [153]:
X_test_note['Instrument'] = preds_inst
In [158]:
X_test_note.head()
Out[158]:
zero_crossing_rate zero_crossings spectrogram mel_spectrogram harmonics perceptual_shock_wave spectral_centroids spectral_centroids_delta spectral_centroids_accelerate chroma1 ... mfcc_delta_17 mfcc_accelerate_17 mfcc18 mfcc_delta_18 mfcc_accelerate_18 mfcc19 mfcc_delta_19 mfcc_accelerate_19 Note Instrument
7829 0.008167 599 -17.000275 -24.065882 -2.391993e-05 0.000028 125.909052 3.842943 0.748480 0.122843 ... 0.074348 -0.021655 -2.285110 0.061437 -0.010873 5.484561 0.001235 -0.002155 30 keyboard
2043 0.018063 1385 -65.334007 -78.159966 3.126499e-05 -0.000030 252.379844 7.212287 1.285274 0.009431 ... -0.035811 0.022589 -11.201360 -0.012952 0.028557 -11.548903 0.006247 0.025766 43 bass
8896 0.013378 1185 -58.515682 -79.649185 3.395107e-05 -0.000033 205.533172 7.821560 1.213353 0.501567 ... 0.075102 0.020115 -18.010260 0.071818 0.034119 -16.802776 0.003094 0.043800 46 mallet
1830 0.004121 319 -14.570759 -44.845161 -1.256410e-02 0.000045 84.263868 -2.796468 1.489708 0.724999 ... 0.027695 0.002246 9.528914 0.063975 0.017251 10.114882 -0.099881 -0.009263 77 bass
12077 0.018680 1650 -36.952751 -68.855492 5.404165e-07 0.000111 444.422995 5.354834 0.846387 0.067997 ... 0.005530 -0.016946 -5.744551 0.008136 -0.012692 -4.648269 0.050792 -0.009209 39 string

5 rows × 89 columns

In [161]:
keyboard = X_test_note[X_test_note['Instrument'] == 'keyboard']
In [163]:
staff = keyboard[(keyboard['Note'] < 80) & (keyboard['Note'] > 40)]
In [164]:
len(staff)
Out[164]:
193
In [166]:
staff.index=range(1,194) 
In [168]:
plt.figure(figsize=(16,8), dpi=200)
plt.scatter(staff.index, staff['Note'])
plt.xlim(0,50)
plt.axhline(y=64, color='black', linestyle='-')
plt.axhline(y=67, color='black', linestyle='-')
plt.axhline(y=71, color='black', linestyle='-')
plt.axhline(y=74, color='black', linestyle='-')
plt.axhline(y=77, color='black', linestyle='-')
plt.axhline(y=43, color='black', linestyle='-')
plt.axhline(y=47, color='black', linestyle='-')
plt.axhline(y=50, color='black', linestyle='-')
plt.axhline(y=53, color='black', linestyle='-')
plt.axhline(y=57, color='black', linestyle='-');